# Computations
import numpy as np
import pandas as pd
# scipy
from scipy.stats import norm
# preprocessing
from sklearn import preprocessing
import re
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## WordCloud
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore")
In this study, we analyze an HR dataset available from kaggle.com.
The data is fictional and was created by IBM data scientists.
Categorical Parameters:
| | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|
| Education | Below College | College | Bachelor | Master | Doctor |
| Environment Satisfaction | Low | Medium | High | Very High | |
| Job Involvement | Low | Medium | High | Very High | |
| Job Satisfaction | Low | Medium | High | Very High | |
| Performance Rating | Low | Good | Excellent | Outstanding | |
| Relationship Satisfaction | Low | Medium | High | Very High | |
| Work Life Balance | Bad | Good | Better | Best | |
This can be encoded as follows,
# Maps each ordinal feature (keyed by its space-separated column name) to a
# {integer code: label} dictionary. Further down it is passed, one feature at a
# time, to Series.replace on the columns of Data.
Categorical_Dict = {'Education': {1:'Below College', 2:'College',3:'Bachelor', 4: 'Master', 5:'Doctor'},
'Environment Satisfaction': {1:'Low', 2:'Medium', 3:'High', 4:'Very High'},
'Job Involvement': {1:'Low', 2:'Medium', 3:'High', 4:'Very High'},
'Job Satisfaction': {1:'Low', 2:'Medium', 3:'High', 4:'Very High'},
'Performance Rating': {1:'Low', 2:'Good', 3:'Excellent', 4:'Outstanding'},
'Relationship Satisfaction': {1:'Low', 2:'Medium', 3:'High', 4:'Very High'},
'Work Life Balance': {1:'Bad', 2:'Good', 3:'Better', 4:'Best'}}
# Location of the Excel workbook that holds the dataset.
Path = 'Data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx'
Data = pd.read_excel(Path)
# Insert a space between a word character and the capital letter that follows it
# in every column name.
Temp = [re.sub(r"(\w)([A-Z])", r"\1 \2", x) for x in Data.columns]
# Expand abbreviations left after the split: ' Curr ' -> ' Current ',
# '18' -> ' 18', 'Num ' -> 'Number Of '.
Temp = [x.replace(' Curr ', ' Current ').replace('18',' 18').replace('Num ','Number Of ') for x in Temp]
Data.columns = Temp
del Temp
# Replace underscores with spaces in the 'Business Travel' values.
Data['Business Travel'] = Data['Business Travel'].str.replace('_',' ')
# Show the first 8 rows with the index hidden.
# NOTE(review): Styler.hide_index() appears to be unavailable in recent pandas releases - confirm against the installed version.
display(Data.head(8).style.hide_index())
# Name of the column used as the prediction target.
Target = 'Attrition'
# Every column except the target and 'Employee Number'. The list is built from a
# set difference, so its order is not guaranteed.
Featured_Columns = list(set(Data.columns) - {Target, 'Employee Number'})
| Age | Attrition | Business Travel | Daily Rate | Department | Distance From Home | Education | Education Field | Employee Count | Employee Number | Environment Satisfaction | Gender | Hourly Rate | Job Involvement | Job Level | Job Role | Job Satisfaction | Marital Status | Monthly Income | Monthly Rate | Number Of Companies Worked | Over 18 | Over Time | Percent Salary Hike | Performance Rating | Relationship Satisfaction | Standard Hours | Stock Option Level | Total Working Years | Training Times Last Year | Work Life Balance | Years At Company | Years In Current Role | Years Since Last Promotion | Years With Current Manager |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 41 | Yes | Travel Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 49 | No | Travel Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 37 | Yes | Travel Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 33 | No | Travel Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 27 | No | Travel Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| 32 | No | Travel Frequently | 1005 | Research & Development | 2 | 2 | Life Sciences | 1 | 8 | 4 | Male | 79 | 3 | 1 | Laboratory Technician | 4 | Single | 3068 | 11864 | 0 | Y | No | 13 | 3 | 3 | 80 | 0 | 8 | 2 | 2 | 7 | 7 | 3 | 6 |
| 59 | No | Travel Rarely | 1324 | Research & Development | 3 | 3 | Medical | 1 | 10 | 3 | Female | 81 | 4 | 1 | Laboratory Technician | 1 | Married | 2670 | 9964 | 4 | Y | Yes | 20 | 4 | 1 | 80 | 3 | 12 | 3 | 2 | 1 | 0 | 0 | 0 |
| 30 | No | Travel Rarely | 1358 | Research & Development | 24 | 1 | Life Sciences | 1 | 11 | 4 | Male | 67 | 3 | 1 | Laboratory Technician | 3 | Divorced | 2693 | 13335 | 1 | Y | No | 22 | 4 | 2 | 80 | 1 | 1 | 2 | 3 | 1 | 0 | 0 | 0 |
First off, let's take a look at the dataset
def Data_Plot(Inp, W = False):
    """Summarise dtype and missing values per column, plot the summary and return it.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to inspect.
    W : int or False, optional
        Figure width passed to the layout; when falsy the width is left untouched.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``Inp`` with the columns 'Features', 'Data Type',
        'Number of NaN Values', 'Size' (row count of ``Inp``) and 'Percentage'
        (share of non-missing values, in percent, rounded to 2 decimals).
    """
    n_rows = Inp.shape[0]
    # The summary is built from dtypes directly; the original first made a full
    # copy of ``Inp`` that was overwritten on the next line.
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    nan_counts = Inp.isnull().sum().to_frame(name='Number of NaN Values')
    data_info = data_info.join(nan_counts, how='outer')
    data_info['Size'] = n_rows
    data_info['Percentage'] = 100 - np.round(100 * (data_info['Number of NaN Values'] / n_rows), 2)
    data_info = data_info.reset_index(drop=False).rename(columns={'index': 'Features'})

    # One bar per feature, coloured and labelled by its dtype.
    fig = px.bar(data_info, x='Features', y='Percentage', color='Data Type', text='Data Type',
                 color_discrete_sequence=['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data=data_info.columns)
    fig.update_layout(plot_bgcolor='white',
                      legend=dict(x=1.01, y=.5, traceorder="normal",
                                  bordercolor="DarkGray", borderwidth=1))
    fig.update_traces(texttemplate=6 * ' ' + '%{label}', textposition='inside')
    fig.update_traces(marker_line_color='Black', marker_line_width=1., opacity=1)
    if W:
        fig.update_layout(width=W)
    fig.show()
    return data_info
# Plot the dtype / missing-value summary of Data; the returned table is discarded.
_ = Data_Plot(Data)
Moreover,
def Distinct_Observations(Inp, Target = Target, Featured_Columns = None, YL = None):
    """Plot and return the number of distinct values in each selected column.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame whose columns are counted.
    Target : str, optional
        Column left out when ``Featured_Columns`` is not given. The default is
        the module-level ``Target`` captured when the function is defined.
    Featured_Columns : list of str, optional
        Columns to count. When ``None``, every column of ``Inp`` except ``Target``.
    YL : number, optional
        Upper limit of the y axis; when ``None`` the axis range is left automatic.

    Returns
    -------
    pandas.Series
        ``Inp[Featured_Columns].nunique()``.
    """
    if Featured_Columns is None:
        # Derive the fallback from the frame that was passed in (the original
        # read the global Data here, which breaks for any other frame).
        Featured_Columns = [col for col in Inp.columns if col != Target]
    Temp = Inp[Featured_Columns].nunique()

    fig = go.Figure([go.Bar(x=Temp.index, y=Temp.values)])
    fig.update_traces(marker_line_color='Navy', marker_line_width=1, opacity=1, showlegend=False)
    fig.update_layout(legend_orientation='v', plot_bgcolor='white', height=450, width=980,
                      title={'text': '<b>Distinct Observations in Each Column</b>', 'x': 0.5,
                             'y': 0.92, 'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title='Frequency')
    # Settings shared by both axes.
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 zerolinewidth=1, zerolinecolor='Black', gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(zeroline=False, showgrid=False, **frame)
    fig.update_yaxes(zeroline=True, showgrid=True, **frame)
    if YL is not None:
        fig.update_yaxes(range=[0, YL])
    fig.show()
    return Temp
# Count distinct values per feature column of Data (y axis capped at 1500).
Temp = Distinct_Observations(Inp = Data, Featured_Columns = Featured_Columns, YL = 1500)
# Show only the features with more than 5 distinct values, sorted by name.
Temp.loc[Temp>5].sort_index().to_frame('Distinct Observations')
| | Distinct Observations |
|---|---|
| Age | 43 |
| Daily Rate | 886 |
| Distance From Home | 29 |
| Education Field | 6 |
| Hourly Rate | 71 |
| Job Role | 9 |
| Monthly Income | 1349 |
| Monthly Rate | 1427 |
| Number Of Companies Worked | 10 |
| Percent Salary Hike | 15 |
| Total Working Years | 40 |
| Training Times Last Year | 7 |
| Years At Company | 37 |
| Years In Current Role | 19 |
| Years Since Last Promotion | 16 |
| Years With Current Manager | 18 |
## Attrition Colormap
# Att_Colors is the colour sequence and Att_LC the bar outline colour used by DistPlot.
Att_Colors = ['LightSalmon', 'LightBlue']
Att_LC = 'Black'
# Gender Colormap
# MF_Colors is the colour sequence used by PlotX1 / PlotY1; MF_LC is the bar
# outline colour used by all four PlotX* / PlotY* functions.
MF_Colors = ['HotPink', 'RoyalBlue']
MF_LC = 'Navy'
# Education
# Ed_Colors is the colour sequence used by PlotX2 / PlotY2 (five entries).
# Ed_LC is not referenced anywhere in the visible code.
Ed_Colors = ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen']
Ed_LC = 'Black'
def DistPlot(Feat, yLim = (0, 80), H = 450, titleY = 0.92):
    """Show a histogram of ``Data[Feat]`` coloured by Attrition with a marginal box plot.

    Parameters
    ----------
    Feat : str
        Column of the module-level ``Data`` frame to plot.
    yLim : sequence of two numbers, optional
        Range applied to the primary y axis.
    H : int, optional
        Figure height.
    titleY : float, optional
        Vertical position of the title.
    """
    fig = px.histogram(Data, x=Feat, color='Attrition', marginal='box',
                       color_discrete_sequence=Att_Colors, hover_data=Data.columns)
    # Settings shared by both axes.
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 zerolinewidth=1, zerolinecolor='Black', gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(zeroline=False, showgrid=False, **frame)
    fig.update_yaxes(zeroline=True, showgrid=True, **frame)
    # The bold tag is closed properly (the original ended the title with a second '<b>').
    title_text = '<b>%s Distribution by Attrition</b>' % Feat
    fig.update_layout(legend_orientation='v', plot_bgcolor='white', height=H, width=980,
                      title={'text': title_text, 'x': 0.5, 'y': titleY,
                             'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title='Frequency')
    fig.update_traces(marker_line_color=Att_LC, marker_line_width=0.5, opacity=1)
    # Only the primary y axis gets the explicit range.
    fig['layout']['yaxis'].update(range=list(yLim))
    fig.show()
def PlotX1(df, Feat, ColorFeat = 'Gender', yLim = (0, 35), H = 500, titleY = 0.90):
    """Draw two side-by-side vertical bar charts of 'Percentage', one per Attrition value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``Feat``, ``ColorFeat``, 'Attrition' and 'Percentage'.
    Feat : str
        Column shown on the x axis.
    ColorFeat : str, optional
        Column used to colour the bars (colours come from ``MF_Colors``).
    yLim : sequence of two numbers, optional
        Range of the y axes.
    H : int, optional
        Figure height.
    titleY : float, optional
        Vertical position of the title.
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.02, shared_yaxes=True,
                        y_title='Percent',
                        subplot_titles=('Attrition: No', 'Attrition: Yes'))
    # (Attrition value, subplot column, extra trace settings); legend entries
    # are kept for the left panel only.
    panels = (('No', 1, {}), ('Yes', 2, {'showlegend': False}))
    for value, col, extra in panels:
        panel = px.bar(df.loc[df.Attrition == value], x=Feat, y='Percentage', orientation='v',
                       color=ColorFeat, text='Percentage', hover_data=df.columns,
                       color_discrete_sequence=MF_Colors)
        for trace in panel.data:
            fig.add_trace(trace, row=1, col=col)
        fig.update_traces(marker_line_color=MF_LC, marker_line_width=1, opacity=1,
                          row=1, col=col, **extra)
    # Settings shared by both axes.
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                 gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(showgrid=False, **frame)
    fig.update_yaxes(showgrid=True, range=list(yLim), **frame)
    fig.update_layout(legend_orientation='v', legend_title_text=ColorFeat,
                      plot_bgcolor='white', height=H, width=980)
    fig.update_layout(legend=dict(font=dict(color="Black"), bordercolor="Lightgray", borderwidth=1))
    # The bold tag is closed properly (the original ended the title with a second '<b>').
    title_text = '<b>%s Distribution by %s and Attrition</b>' % (Feat, ColorFeat)
    fig.update_layout(title={'text': title_text, 'x': 0.5, 'y': titleY,
                             'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def PlotX2(df, Feat, ColorFeat = 'Education', yLim = (0, 35), H = 500, titleY = 0.90):
    """Draw two side-by-side vertical bar charts of 'Percentage', one per Attrition value.

    Same layout as the Gender variant, but bars are coloured from ``Ed_Colors``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``Feat``, ``ColorFeat``, 'Attrition' and 'Percentage'.
    Feat : str
        Column shown on the x axis.
    ColorFeat : str, optional
        Column used to colour the bars.
    yLim : sequence of two numbers, optional
        Range of the y axes.
    H : int, optional
        Figure height.
    titleY : float, optional
        Vertical position of the title.
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.02, shared_yaxes=True,
                        y_title='Percent',
                        subplot_titles=('Attrition: No', 'Attrition: Yes'))
    # (Attrition value, subplot column, extra trace settings); legend entries
    # are kept for the left panel only.
    panels = (('No', 1, {}), ('Yes', 2, {'showlegend': False}))
    for value, col, extra in panels:
        # barmode is now passed for both panels (the original set it on the
        # left one only). Only the traces of this helper figure are copied over.
        panel = px.bar(df.loc[df.Attrition == value], x=Feat, y='Percentage', orientation='v',
                       barmode='group', color=ColorFeat, text='Percentage',
                       hover_data=df.columns, color_discrete_sequence=Ed_Colors)
        for trace in panel.data:
            fig.add_trace(trace, row=1, col=col)
        fig.update_traces(marker_line_color=MF_LC, marker_line_width=1, opacity=1,
                          row=1, col=col, **extra)
    # Settings shared by both axes.
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                 gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(showgrid=False, **frame)
    fig.update_yaxes(showgrid=True, range=list(yLim), **frame)
    fig.update_layout(legend_orientation='v', legend_title_text=ColorFeat,
                      plot_bgcolor='white', height=H, width=980)
    fig.update_layout(legend=dict(font=dict(color="Black"), bordercolor="Lightgray", borderwidth=1))
    # The bold tag is closed properly (the original ended the title with a second '<b>').
    title_text = '<b>%s Distribution by %s and Attrition</b>' % (Feat, ColorFeat)
    fig.update_layout(title={'text': title_text, 'x': 0.5, 'y': titleY,
                             'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def PlotY1(df, Feat, ColorFeat = 'Gender', xLim = (0, 35), H = 500, titleY = 0.90):
    """Draw two stacked horizontal bar charts of 'Percentage', one per Attrition value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``Feat``, ``ColorFeat``, 'Attrition' and 'Percentage'.
    Feat : str
        Column shown on the y axis.
    ColorFeat : str, optional
        Column used to colour the bars (colours come from ``MF_Colors``).
    xLim : sequence of two numbers, optional
        Range of the x axes.
    H : int, optional
        Figure height.
    titleY : float, optional
        Vertical position of the title.
    """
    fig = make_subplots(rows=2, cols=1, vertical_spacing=0.08, shared_yaxes=True,
                        x_title='Percent',
                        subplot_titles=('Attrition: No', 'Attrition: Yes'))
    # (Attrition value, subplot row, extra trace settings); legend entries are
    # kept for the top panel only.
    panels = (('No', 1, {}), ('Yes', 2, {'showlegend': False}))
    for value, row, extra in panels:
        panel = px.bar(df.loc[df.Attrition == value], y=Feat, x='Percentage', orientation='h',
                       color=ColorFeat, text='Percentage', hover_data=df.columns,
                       color_discrete_sequence=MF_Colors)
        for trace in panel.data:
            fig.add_trace(trace, row=row, col=1)
        fig.update_traces(marker_line_color=MF_LC, marker_line_width=1, opacity=1,
                          row=row, col=1, **extra)
    # Settings shared by both axes.
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                 gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(showgrid=True, range=list(xLim), **frame)
    fig.update_yaxes(showgrid=False, **frame)
    fig.update_layout(legend_orientation='v', legend_title_text=ColorFeat,
                      plot_bgcolor='white', height=H, width=980)
    fig.update_layout(legend=dict(font=dict(color="Black"), bordercolor="Lightgray", borderwidth=1))
    # The bold tag is closed properly (the original ended the title with a second '<b>').
    title_text = '<b>%s Distribution by %s and Attrition</b>' % (Feat, ColorFeat)
    fig.update_layout(title={'text': title_text, 'x': 0.5, 'y': titleY,
                             'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def PlotY2(df, Feat, ColorFeat = 'Education', xLim = (0, 35), H = 500, titleY = 0.90):
    """Draw two stacked horizontal bar charts of 'Percentage', one per Attrition value.

    Same layout as the Gender variant, but bars are coloured from ``Ed_Colors``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``Feat``, ``ColorFeat``, 'Attrition' and 'Percentage'.
    Feat : str
        Column shown on the y axis.
    ColorFeat : str, optional
        Column used to colour the bars.
    xLim : sequence of two numbers, optional
        Range of the x axes.
    H : int, optional
        Figure height.
    titleY : float, optional
        Vertical position of the title.
    """
    fig = make_subplots(rows=2, cols=1, vertical_spacing=0.08, shared_yaxes=True,
                        x_title='Percent',
                        subplot_titles=('Attrition: No', 'Attrition: Yes'))
    # (Attrition value, subplot row, extra trace settings); legend entries are
    # kept for the top panel only.
    panels = (('No', 1, {}), ('Yes', 2, {'showlegend': False}))
    for value, row, extra in panels:
        # barmode is now passed for both panels (the original set it on the
        # top one only). Only the traces of this helper figure are copied over.
        panel = px.bar(df.loc[df.Attrition == value], y=Feat, x='Percentage', orientation='h',
                       barmode='group', color=ColorFeat, text='Percentage',
                       hover_data=df.columns, color_discrete_sequence=Ed_Colors)
        for trace in panel.data:
            fig.add_trace(trace, row=row, col=1)
        fig.update_traces(marker_line_color=MF_LC, marker_line_width=1, opacity=1,
                          row=row, col=1, **extra)
    # Settings shared by both axes.
    frame = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                 gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(showgrid=True, range=list(xLim), **frame)
    fig.update_yaxes(showgrid=False, **frame)
    fig.update_layout(legend_orientation='v', legend_title_text=ColorFeat,
                      plot_bgcolor='white', height=H, width=980)
    fig.update_layout(legend=dict(font=dict(color="Black"), bordercolor="Lightgray", borderwidth=1))
    # The bold tag is closed properly (the original ended the title with a second '<b>').
    title_text = '<b>%s Distribution by %s and Attrition</b>' % (Feat, ColorFeat)
    fig.update_layout(title={'text': title_text, 'x': 0.5, 'y': titleY,
                             'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# For plotting
def FeatCut(Feat, Bins, ColorFeat = 'Gender', Inp = Data):
    """Bin a numeric feature and tabulate counts per (bin, ColorFeat, Attrition).

    Parameters
    ----------
    Feat : str
        Numeric column to bin.
    Bins : sequence of numbers
        Bin edges; consecutive pairs form the intervals handed to ``pd.cut``.
    ColorFeat : str, optional
        Second grouping column.
    Inp : pandas.DataFrame, optional
        Source frame. The default is the module-level ``Data`` object captured
        when the function is defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feat`` (interval labels as strings), ``ColorFeat``,
        'Attrition', 'Count' and 'Percentage' (share of the total count,
        rounded to 2 decimals), sorted by interval.
    """
    # Work on an explicit copy so the assignment below never targets a slice of Inp.
    Out = Inp[[Feat, ColorFeat, 'Attrition']].copy()
    intervals = pd.IntervalIndex.from_tuples(list(zip(Bins[:-1], Bins[1:])))
    Out[Feat] = pd.cut(Out[Feat], bins=intervals)
    # Counting, percentages, sorting and string conversion are the same steps
    # Table performs, so they are delegated instead of being repeated here.
    return Table(Feat, ColorFeat=ColorFeat, Inp=Out)
def Table(Feat, ColorFeat = 'Gender', Inp = Data):
    """Tabulate counts and percentages per (Feat, ColorFeat, Attrition) group.

    Parameters
    ----------
    Feat : str
        First grouping column.
    ColorFeat : str, optional
        Second grouping column.
    Inp : pandas.DataFrame, optional
        Source frame. The default is the module-level ``Data`` object captured
        when the function is defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feat`` (converted to strings), ``ColorFeat``, 'Attrition',
        'Count' and 'Percentage' (share of the total count, rounded to
        2 decimals), sorted by ``Feat``.
    """
    group_keys = [Feat, ColorFeat, 'Attrition']
    # Non-null 'Attrition' entries per group, kept as a one-column frame.
    counts = Inp[group_keys].groupby(group_keys)['Attrition'].count().to_frame('Count')
    counts['Percentage'] = np.round(100 * counts.values / counts.sum().values, 2)
    result = counts.reset_index(drop=False).sort_values(by=[Feat])
    # Sort first, then stringify, so the order follows the original values.
    result[Feat] = result[Feat].astype(str)
    return result
# For preprocessing
def ColBins(Inp, Bins):
    """Replace each value by the ordinal index of the bin it falls into.

    Parameters
    ----------
    Inp : pandas.Series
        Numeric values to bin.
    Bins : sequence of numbers
        Bin edges; consecutive pairs ``(Bins[i], Bins[i + 1]]`` form the bins.

    Returns
    -------
    pandas.Series
        Integer codes on the index of ``Inp``: 0 for the first bin, 1 for the
        second, and so on. Values outside every bin get -1.

    Notes
    -----
    Codes follow the numeric order of the bins. The previous implementation
    ranked the interval labels as strings, so '(1000, 1500]' sorted before
    '(300, 600]' and the codes did not follow the bin order.
    """
    intervals = pd.IntervalIndex.from_tuples(list(zip(Bins[:-1], Bins[1:])))
    binned = pd.cut(Inp, bins=intervals)
    # The categorical produced by pd.cut is ordered by bin, so its codes are
    # the ordinal positions.
    return binned.cat.codes.astype('int64')
# df keeps a copy of the dataset as it is before the label replacement below
df = Data.copy()
# In Data (used for plotting only), swap the integer codes for their labels.
# Columns are assigned in place so Data stays the same object.
for Feat in Categorical_Dict:
    Data[Feat] = Data[Feat].replace(Categorical_Dict[Feat])
del Feat
# --- Age: histogram, binned bar charts by Gender and by Education ---
# FeatCut / Table are called without Inp, so they read the labelled Data frame;
# ColBins is applied to the separate df copy.
Feat = 'Age'
Bins = [15, 24, 40, 59, 80]
DistPlot(Feat)
PlotX1(df = FeatCut(Feat, Bins = Bins), Feat = Feat)
PlotX2(df = FeatCut(Feat, Bins, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 20])
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Feat, Bins
# --- Business Travel: bar charts of the raw categories ---
Feat = 'Business Travel'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 40], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 25], H = 500, titleY = 0.90)
del Feat
# --- Daily Rate: histogram, binned bar charts, bin codes stored in df ---
Feat = 'Daily Rate'
Bins = [100, 300, 600, 1000, 1500]
DistPlot(Feat, yLim = [0, 250], H = 450, titleY = 0.92)
PlotX1(df = FeatCut(Feat, Bins = Bins), Feat = Feat, yLim = [0, 20])
PlotX2(df = FeatCut(Feat, Bins, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 12])
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
# Only Feat is deleted here; Bins stays defined until it is reassigned.
del Feat
# --- Department: horizontal bars by Gender, vertical bars by Education ---
Feat = 'Department'
PlotY1(df = Table(Feat), Feat = Feat, xLim = [0, 40], H = 550, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 25])
del Feat
# --- Distance From Home: histogram, binned bar charts, bin codes stored in df ---
Feat = 'Distance From Home'
Bins = [0, 5, 10, 20, 30]
DistPlot(Feat, yLim = [0, 250], H = 450, titleY = 0.92)
PlotX1(df = FeatCut(Feat, Bins = Bins), Feat = Feat, yLim = [0, 30])
PlotX2(df = FeatCut(Feat, Bins, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 20])
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Feat
# --- Education: bar chart by Gender only ---
Feat = 'Education'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 25])
del Feat
# --- Education Field ---
Feat = 'Education Field'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 25])
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 14])
del Feat
# --- Environment Satisfaction ---
Feat = 'Environment Satisfaction'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 20])
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 12])
del Feat
# --- Hourly Rate: histogram, binned bar charts, bin codes stored in df ---
Feat = 'Hourly Rate'
Bins = [25, 50, 75, 101]
DistPlot(Feat, yLim = [0, 140], H = 500, titleY = 0.92)
PlotX1(df = FeatCut(Feat, Bins = Bins), Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
PlotX2(df = FeatCut(Feat, Bins, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 14], H = 500, titleY = 0.90)
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Bins, Feat
# --- Job Involvement ---
Feat = 'Job Involvement'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 35], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
del Feat
# --- Job Level ---
Feat = 'Job Level'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 12], H = 500, titleY = 0.90)
del Feat
# --- Job Role ---
Feat = 'Job Role'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 12], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 8], H = 500, titleY = 0.90)
del Feat
# --- Job Satisfaction ---
Feat = 'Job Satisfaction'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 10], H = 500, titleY = 0.90)
del Feat
# --- Marital Status ---
Feat = 'Marital Status'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 25], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 16], H = 500, titleY = 0.90)
del Feat
# --- Monthly Income: histogram, binned bar charts, bin codes stored in df ---
Feat = 'Monthly Income'
Bins = [1e3, 3e3, 7e3, 1e4, 2e4]
# Convert the float literals to ints before they are used as bin edges.
Bins = [int(x) for x in Bins]
DistPlot(Feat, yLim = [0, 400], H = 500, titleY = 0.92)
PlotX1(df = FeatCut(Feat, Bins = Bins), Feat = Feat, yLim = [0, 25], H = 500, titleY = 0.90)
PlotX2(df = FeatCut(Feat, Bins, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 14], H = 500, titleY = 0.90)
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Bins, Feat
# --- Monthly Rate: histogram, binned bar charts, bin codes stored in df ---
Feat = 'Monthly Rate'
Bins = [2e3, 1e4, 2e4, 3e4]
# Convert the float literals to ints before they are used as bin edges.
Bins = [int(x) for x in Bins]
DistPlot(Feat, yLim = [0, 80], H = 500, titleY = 0.92)
PlotX1(df = FeatCut(Feat, Bins = Bins), Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
PlotX2(df = FeatCut(Feat, Bins, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 14], H = 500, titleY = 0.90)
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Bins, Feat
# --- Number Of Companies Worked ---
# The first edge is -1 so that 0 falls inside the first (left-open) interval;
# the label '(-1, 2]' is then rewritten as '[0, 2]' for display.
Feat = 'Number Of Companies Worked'
Bins = [-1, 2, 5, 10]
DistPlot(Feat = Feat, yLim = [0, 600], H = 450, titleY = 0.92)
Temp = FeatCut(Feat, Bins = Bins).replace({'(-1, 2]': '[0, 2]'})
PlotX1(df = Temp, Feat = Feat, yLim = [0, 35], H = 500, titleY = 0.90)
Temp = FeatCut(Feat, Bins, ColorFeat = 'Education').replace({'(-1, 2]': '[0, 2]'})
PlotX2(df = Temp, Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Temp, Feat
# --- Over Time ---
Feat = 'Over Time'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 40], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 30], H = 500, titleY = 0.90)
del Feat
# --- Percent Salary Hike: binned bar charts, bin codes stored in df ---
# Edges are 10, 14, 18, 22, 26.
Bins = list(np.arange(10, 27, 4))
Feat = 'Percent Salary Hike'
PlotX1(df = FeatCut(Feat, Bins = Bins), Feat = Feat, yLim = [0, 30], H = 500, titleY = 0.90)
PlotX2(df = FeatCut(Feat, Bins, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Feat
# --- Performance Rating ---
Feat = 'Performance Rating'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 50], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 30], H = 500, titleY = 0.90)
del Feat
# --- Relationship Satisfaction ---
Feat = 'Relationship Satisfaction'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 12], H = 500, titleY = 0.90)
del Feat
# --- Stock Option Level ---
Feat = 'Stock Option Level'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 25], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 16], H = 500, titleY = 0.90)
del Feat
# --- Total Working Years ---
# The first edge is -1 so that 0 falls inside the first (left-open) interval;
# the label '(-1, 10]' is then rewritten as '[0, 10]' for display.
Feat = 'Total Working Years'
Bins = [-1, 10, 20, 30, 41]
DistPlot(Feat = Feat, yLim = [0, 250], H = 450, titleY = 0.92)
Temp = FeatCut(Feat, Bins = Bins).replace({'(-1, 10]': '[0, 10]'})
PlotX1(df = Temp, Feat = Feat, yLim = [0, 35], H = 500, titleY = 0.90)
Temp = FeatCut(Feat, Bins, ColorFeat = 'Education').replace({'(-1, 10]': '[0, 10]'})
PlotX2(df = Temp, Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Temp, Feat
# --- Training Times Last Year ---
Feat = 'Training Times Last Year'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 12], H = 500, titleY = 0.90)
del Feat
# --- Work Life Balance ---
Feat = 'Work Life Balance'
PlotX1(df = Table(Feat), Feat = Feat, yLim = [0, 35], H = 500, titleY = 0.90)
PlotX2(df = Table(Feat, ColorFeat = 'Education'), Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
del Feat
# In each cell below the first edge is -1 so that 0 falls inside the first
# (left-open) interval; that interval's label is rewritten with a closed lower
# bound of 0 for display.
# --- Years At Company ---
Feat = 'Years At Company'
Bins = [-1, 10, 20, 30, 41]
DistPlot(Feat, yLim = [0, 200], H = 450, titleY = 0.92)
Temp = FeatCut(Feat, Bins = Bins).replace({'(-1, 10]': '[0, 10]'})
PlotX1(df = Temp, Feat = Feat, yLim = [0, 50], H = 500, titleY = 0.90)
Temp = FeatCut(Feat, Bins, ColorFeat = 'Education').replace({'(-1, 10]': '[0, 10]'})
PlotX2(df = Temp, Feat = Feat, yLim = [0, 30], H = 500, titleY = 0.90)
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Temp, Feat
# --- Years In Current Role ---
Feat = 'Years In Current Role'
Bins = [-1, 4, 8, 12, 19]
DistPlot(Feat = Feat, yLim = [0, 400], H = 450, titleY = 0.92)
Temp = FeatCut(Feat, Bins = Bins).replace({'(-1, 4]': '[0, 4]'})
PlotX1(df = Temp, Feat = Feat, yLim = [0, 30], H = 500, titleY = 0.90)
Temp = FeatCut(Feat, Bins, ColorFeat = 'Education').replace({'(-1, 4]': '[0, 4]'})
PlotX2(df = Temp, Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Temp, Feat
# --- Years Since Last Promotion ---
Feat = 'Years Since Last Promotion'
Bins = [-1, 5, 10, 16]
DistPlot(Feat = Feat, yLim = [0, 600], H = 450, titleY = 0.92)
Temp = FeatCut(Feat, Bins = Bins).replace({'(-1, 5]': '[0, 5]'})
PlotX1(df = Temp, Feat = Feat, yLim = [0, 50], H = 500, titleY = 0.90)
Temp = FeatCut(Feat, Bins, ColorFeat = 'Education').replace({'(-1, 5]': '[0, 5]'})
PlotX2(df = Temp, Feat = Feat, yLim = [0, 30], H = 500, titleY = 0.90)
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Temp, Feat
# --- Years With Current Manager ---
Feat = 'Years With Current Manager'
Bins = [-1, 4, 8, 12, 18]
DistPlot(Feat = Feat, yLim = [0, 400], H = 450, titleY = 0.92)
Temp = FeatCut(Feat, Bins = Bins).replace({'(-1, 4]': '[0, 4]'})
PlotX1(df = Temp, Feat = Feat, yLim = [0, 35], H = 500, titleY = 0.90)
Temp = FeatCut(Feat, Bins, ColorFeat = 'Education').replace({'(-1, 4]': '[0, 4]'})
PlotX2(df = Temp, Feat = Feat, yLim = [0, 20], H = 500, titleY = 0.90)
# For Preprocessing
df[Feat] = ColBins(Inp = df[Feat], Bins = Bins)
del Temp, Feat
Now
# Re-plot the dtype / missing-value summary of Data (the frame with labels substituted).
_ = Data_Plot(Data)
# Distinct values per feature in df (the frame holding the bin codes), y axis capped at 10.
_ = Distinct_Observations(Inp = df, Featured_Columns = Featured_Columns, YL = 10)
In the dataset, Attrition indicates whether an employee has left the company (churned) or not. We would like to build a predictive model for this feature.
We need to convert categorical data to numeric data.
def dtypes_group(Inp):
    """Group the columns of a frame by dtype.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per distinct dtype with the columns 'Data Type' (dtype name as
        a string), 'Features' (list of column names, in the column order of
        ``Inp``) and 'Count' (length of that list). Rows are sorted by dtype
        name. An empty frame yields an empty result with the same columns.

    Notes
    -----
    The previous implementation sorted the raw dtype objects with an unstable
    sort, which scrambled the feature order, and filled list-valued cells
    through boolean ``.loc`` assignment. Grouping in a single pass over the
    dtype names gives a deterministic result.
    """
    grouped = {}
    for feature, type_name in Inp.dtypes.astype(str).items():
        grouped.setdefault(type_name, []).append(feature)
    # Row tuples keep each feature list as a single cell value.
    rows = [(type_name, features, len(features))
            for type_name, features in sorted(grouped.items())]
    return pd.DataFrame(rows, columns=['Data Type', 'Features', 'Count'])
def dtype_sep(Inp):
    """Split the column names of a frame into numeric and categorical lists.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame whose columns are classified by dtype name.

    Returns
    -------
    tuple of (list, list)
        ``(Numeric_Columns, Categorical_Columns)``. Numeric columns are those
        whose dtype name is 'int64', 'int32', 'float64' or 'float32';
        categorical columns are those whose dtype name is 'object'. Columns of
        any other dtype appear in neither list. Both lists keep the column
        order of ``Inp``.
    """
    numeric_names = ('int64', 'int32', 'float64', 'float32')
    type_names = Inp.dtypes.astype(str)
    numeric_columns = [col for col, name in type_names.items() if name in numeric_names]
    categorical_columns = [col for col, name in type_names.items() if name == 'object']
    return numeric_columns, categorical_columns
# Classify the columns of Data (the frame with labels substituted) by dtype.
Numeric_Columns, Categorical_Columns = dtype_sep(Data)
# Show the dtype grouping of Data with the index hidden.
display(dtypes_group(Data).style.hide_index())
| Data Type | Features | Count |
|---|---|---|
| int64 | ['Age', 'Years In Current Role', 'Years At Company', 'Training Times Last Year', 'Total Working Years', 'Stock Option Level', 'Standard Hours', 'Percent Salary Hike', 'Number Of Companies Worked', 'Monthly Rate', 'Monthly Income', 'Years Since Last Promotion', 'Job Level', 'Years With Current Manager', 'Hourly Rate', 'Daily Rate', 'Employee Number', 'Employee Count', 'Distance From Home'] | 19 |
| object | ['Attrition', 'Business Travel', 'Work Life Balance', 'Department', 'Education', 'Relationship Satisfaction', 'Performance Rating', 'Education Field', 'Over Time', 'Over 18', 'Environment Satisfaction', 'Gender', 'Job Satisfaction', 'Job Role', 'Job Involvement', 'Marital Status'] | 16 |
We can use LabelEncoder to convert the categorical features to numeric values. Therefore,
N = len(Categorical_Columns)
# Progressbar
Counter = 0
Progress_Bar = progressbar.ProgressBar(maxval= N, widgets=[progressbar.Bar('=', '|', '|'), progressbar.Percentage()])
#--------------- the loop ----------------------
Progress_Bar.start()
# Counter holds the number of columns finished so far, so the bar is updated
# with 1..N (the original reported the count before incrementing it and never
# passed N to update()).
for Counter, Col in enumerate(Categorical_Columns, start=1):
    # A fresh encoder per column: fit on the column's values, then replace the
    # column in df with the integer codes.
    df[Col] = preprocessing.LabelEncoder().fit_transform(df[Col])
    Progress_Bar.update(Counter)
Progress_Bar.finish()
#--------------- End of the loop ---------------
# Finally, converting values of df back to integers.
df = df.astype(int)
display(dtypes_group(df).style.hide_index())
|=========================================================================|100%
| Data Type | Features | Count |
|---|---|---|
| int32 | ['Age', 'Monthly Rate', 'Number Of Companies Worked', 'Over 18', 'Over Time', 'Percent Salary Hike', 'Performance Rating', 'Monthly Income', 'Relationship Satisfaction', 'Stock Option Level', 'Total Working Years', 'Training Times Last Year', 'Work Life Balance', 'Years At Company', 'Years In Current Role', 'Standard Hours', 'Years Since Last Promotion', 'Marital Status', 'Job Role', 'Attrition', 'Business Travel', 'Daily Rate', 'Department', 'Distance From Home', 'Education', 'Job Satisfaction', 'Education Field', 'Employee Number', 'Environment Satisfaction', 'Gender', 'Hourly Rate', 'Job Involvement', 'Job Level', 'Employee Count', 'Years With Current Manager'] | 35 |
First, we remove features that have zero variance as these features don't add anything to our modeling.
# var
# Variance of every feature column of df, largest first.
Temp = df[Featured_Columns].var().sort_values(ascending = False)
# Names of the features whose variance is 0 after rounding to 16 decimals.
Temp = Temp.loc[Temp.round(16) ==0].index.tolist()
# Print the names after a colour-styled label (colorama codes).
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Features with variance zero' + Style.RESET_ALL + ':' + '%s' % ', '.join(Temp))
# Remove those columns from df.
df = df.drop(columns = Temp)
del Temp
Features with variance zero:Employee Count, Over 18, Standard Hours
# Columns kept out of the feature matrix: the target and 'Employee Number'.
Aditional_Columns = [Target, 'Employee Number']
# X holds the remaining columns of df; y holds the target column.
X = df.drop(columns = Aditional_Columns)
y = df[Target]
Moreover, high variance for some features can hurt our modeling process. For this reason, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().
# scaling data
scaler = preprocessing.StandardScaler()
# Fit the scaler on X and transform it in one step.
X_std = scaler.fit_transform(X)
# Wrap the result in a DataFrame that reuses the column names of X.
X_std = pd.DataFrame(data = X_std, columns =X.columns)
del scaler
# Two stacked heatmaps: feature variances before and after standardization.
fig, ax = plt.subplots(2, 1, figsize=(18, 8))
ax = ax.ravel()
font = FontProperties()
font.set_weight('bold')
CP = [sns.color_palette("OrRd", 20), sns.color_palette("Greens", 20)]
Names = ['Variance of the Features', 'Variance of the Features (Standardized)']
Sets = [X, X_std]
kws = dict(label='Feature\nVariance', aspect=20, shrink= .3)
for Axis, Palette, Name, Frame in zip(ax, CP, Names, Sets):
    # One-row frame of variances, largest first, rounded to 2 decimals.
    Temp = Frame.var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T
    # .iloc[0] reads the single row maximum by position; the previous [0]
    # relied on integer-key fallback for a Series indexed by the label 'Variance'.
    _ = sns.heatmap(Temp, ax=Axis, annot=True, square=True, cmap = Palette,
                    linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1).iloc[0], annot_kws={"size": 6},
                    cbar_kws=kws)
    _ = Axis.set_yticklabels('')
    _ = Axis.set_title(Name, fontproperties=font, fontsize = 16)
    del Temp
del CP, Names, ax, fig, font, Sets, kws
del Axis, Palette, Name, Frame
Modifying dataset.
# Overwrite the feature columns of df with their standardized values from X_std.
df[X.columns.tolist()] = X_std[X.columns.tolist()]
# Write df to a CSV named after the text before the first '.' in Path plus '_STD.csv'.
# NOTE(review): index = None is presumably treated like False (no index column written) - confirm.
df.to_csv (Path.split(".")[0]+'_STD.csv', index = None, header=True)